R"===(
#ifndef WOLF_AES_CL
#define WOLF_AES_CL

/* For Mesa clover support */
#ifdef cl_clang_storage_class_specifiers
#   pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
#endif

#ifdef cl_amd_media_ops
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#else
/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
 * Build-in Function
 *     uintn  amd_bitalign (uintn src0, uintn src1, uintn src2)
 *   Description
 *     dst.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
 *     similar operation applied to other components of the vectors.
 *
 * The implemented function is modified because the last is in our case always a scalar.
 * We can ignore the bitwise AND operation.
 */
inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
{
	uint2 result;
	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
	return result;
}
#endif

#ifdef cl_amd_media_ops2
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
#else
/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
 *     Built-in Function:
 *     uintn amd_bfe (uintn src0, uintn src1, uintn src2)
 *   Description
 *     NOTE: operator >> below represent logical right shift
 *     offset = src1.s0 & 31;
 *     width = src2.s0 & 31;
 *     if width = 0
 *         dst.s0 = 0;
 *     else if (offset + width) < 32
 *         dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
 *     else
 *         dst.s0 = src0.s0 >> offset;
 *     similar operation applied to other components of the vectors
 */
inline int amd_bfe(const uint src0, const uint offset, const uint width)
{
	/* casts are removed because we can implement everything as uint
	 * int offset = src1;
	 * int width = src2;
	 * remove check for edge case, this function is always called with
	 * `width==8`
	 * @code
	 *   if ( width == 0 )
	 *      return 0;
	 * @endcode
	 */
	if ( (offset + width) < 32u )
		return (src0 << (32u - offset - width)) >> (32u - width);

	return src0 >> offset;
}
#endif

// AES table - the other three are generated on the fly

static const __constant uint AES0_C[256] =
{
	0xA56363C6U, 0x847C7CF8U, 0x997777EEU, 0x8D7B7BF6U,
	0x0DF2F2FFU, 0xBD6B6BD6U, 0xB16F6FDEU, 0x54C5C591U,
	0x50303060U, 0x03010102U, 0xA96767CEU, 0x7D2B2B56U,
	0x19FEFEE7U, 0x62D7D7B5U, 0xE6ABAB4DU, 0x9A7676ECU,
	0x45CACA8FU, 0x9D82821FU, 0x40C9C989U, 0x877D7DFAU,
	0x15FAFAEFU, 0xEB5959B2U, 0xC947478EU, 0x0BF0F0FBU,
	0xECADAD41U, 0x67D4D4B3U, 0xFDA2A25FU, 0xEAAFAF45U,
	0xBF9C9C23U, 0xF7A4A453U, 0x967272E4U, 0x5BC0C09BU,
	0xC2B7B775U, 0x1CFDFDE1U, 0xAE93933DU, 0x6A26264CU,
	0x5A36366CU, 0x413F3F7EU, 0x02F7F7F5U, 0x4FCCCC83U,
	0x5C343468U, 0xF4A5A551U, 0x34E5E5D1U, 0x08F1F1F9U,
	0x937171E2U, 0x73D8D8ABU, 0x53313162U, 0x3F15152AU,
	0x0C040408U, 0x52C7C795U, 0x65232346U, 0x5EC3C39DU,
	0x28181830U, 0xA1969637U, 0x0F05050AU, 0xB59A9A2FU,
	0x0907070EU, 0x36121224U, 0x9B80801BU, 0x3DE2E2DFU,
	0x26EBEBCDU, 0x6927274EU, 0xCDB2B27FU, 0x9F7575EAU,
	0x1B090912U, 0x9E83831DU, 0x742C2C58U, 0x2E1A1A34U,
	0x2D1B1B36U, 0xB26E6EDCU, 0xEE5A5AB4U, 0xFBA0A05BU,
	0xF65252A4U, 0x4D3B3B76U, 0x61D6D6B7U, 0xCEB3B37DU,
	0x7B292952U, 0x3EE3E3DDU, 0x712F2F5EU, 0x97848413U,
	0xF55353A6U, 0x68D1D1B9U, 0x00000000U, 0x2CEDEDC1U,
	0x60202040U, 0x1FFCFCE3U, 0xC8B1B179U, 0xED5B5BB6U,
	0xBE6A6AD4U, 0x46CBCB8DU, 0xD9BEBE67U, 0x4B393972U,
	0xDE4A4A94U, 0xD44C4C98U, 0xE85858B0U, 0x4ACFCF85U,
	0x6BD0D0BBU, 0x2AEFEFC5U, 0xE5AAAA4FU, 0x16FBFBEDU,
	0xC5434386U, 0xD74D4D9AU, 0x55333366U, 0x94858511U,
	0xCF45458AU, 0x10F9F9E9U, 0x06020204U, 0x817F7FFEU,
	0xF05050A0U, 0x443C3C78U, 0xBA9F9F25U, 0xE3A8A84BU,
	0xF35151A2U, 0xFEA3A35DU, 0xC0404080U, 0x8A8F8F05U,
	0xAD92923FU, 0xBC9D9D21U, 0x48383870U, 0x04F5F5F1U,
	0xDFBCBC63U, 0xC1B6B677U, 0x75DADAAFU, 0x63212142U,
	0x30101020U, 0x1AFFFFE5U, 0x0EF3F3FDU, 0x6DD2D2BFU,
	0x4CCDCD81U, 0x140C0C18U, 0x35131326U, 0x2FECECC3U,
	0xE15F5FBEU, 0xA2979735U, 0xCC444488U, 0x3917172EU,
	0x57C4C493U, 0xF2A7A755U, 0x827E7EFCU, 0x473D3D7AU,
	0xAC6464C8U, 0xE75D5DBAU, 0x2B191932U, 0x957373E6U,
	0xA06060C0U, 0x98818119U, 0xD14F4F9EU, 0x7FDCDCA3U,
	0x66222244U, 0x7E2A2A54U, 0xAB90903BU, 0x8388880BU,
	0xCA46468CU, 0x29EEEEC7U, 0xD3B8B86BU, 0x3C141428U,
	0x79DEDEA7U, 0xE25E5EBCU, 0x1D0B0B16U, 0x76DBDBADU,
	0x3BE0E0DBU, 0x56323264U, 0x4E3A3A74U, 0x1E0A0A14U,
	0xDB494992U, 0x0A06060CU, 0x6C242448U, 0xE45C5CB8U,
	0x5DC2C29FU, 0x6ED3D3BDU, 0xEFACAC43U, 0xA66262C4U,
	0xA8919139U, 0xA4959531U, 0x37E4E4D3U, 0x8B7979F2U,
	0x32E7E7D5U, 0x43C8C88BU, 0x5937376EU, 0xB76D6DDAU,
	0x8C8D8D01U, 0x64D5D5B1U, 0xD24E4E9CU, 0xE0A9A949U,
	0xB46C6CD8U, 0xFA5656ACU, 0x07F4F4F3U, 0x25EAEACFU,
	0xAF6565CAU, 0x8E7A7AF4U, 0xE9AEAE47U, 0x18080810U,
	0xD5BABA6FU, 0x887878F0U, 0x6F25254AU, 0x722E2E5CU,
	0x241C1C38U, 0xF1A6A657U, 0xC7B4B473U, 0x51C6C697U,
	0x23E8E8CBU, 0x7CDDDDA1U, 0x9C7474E8U, 0x211F1F3EU,
	0xDD4B4B96U, 0xDCBDBD61U, 0x868B8B0DU, 0x858A8A0FU,
	0x907070E0U, 0x423E3E7CU, 0xC4B5B571U, 0xAA6666CCU,
	0xD8484890U, 0x05030306U, 0x01F6F6F7U, 0x120E0E1CU,
	0xA36161C2U, 0x5F35356AU, 0xF95757AEU, 0xD0B9B969U,
	0x91868617U, 0x58C1C199U, 0x271D1D3AU, 0xB99E9E27U,
	0x38E1E1D9U, 0x13F8F8EBU, 0xB398982BU, 0x33111122U,
	0xBB6969D2U, 0x70D9D9A9U, 0x898E8E07U, 0xA7949433U,
	0xB69B9B2DU, 0x221E1E3CU, 0x92878715U, 0x20E9E9C9U,
	0x49CECE87U, 0xFF5555AAU, 0x78282850U, 0x7ADFDFA5U,
	0x8F8C8C03U, 0xF8A1A159U, 0x80898909U, 0x170D0D1AU,
	0xDABFBF65U, 0x31E6E6D7U, 0xC6424284U, 0xB86868D0U,
	0xC3414182U, 0xB0999929U, 0x772D2D5AU, 0x110F0F1EU,
	0xCBB0B07BU, 0xFC5454A8U, 0xD6BBBB6DU, 0x3A16162CU
};

#define BYTE(x, y)	(amd_bfe((x), (y) << 3U, 8U))

uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key)
{
	key.s0 ^= AES0[BYTE(X.s0, 0)];
	key.s1 ^= AES0[BYTE(X.s1, 0)];
	key.s2 ^= AES0[BYTE(X.s2, 0)];
	key.s3 ^= AES0[BYTE(X.s3, 0)];

	key.s0 ^= AES2[BYTE(X.s2, 2)];
	key.s1 ^= AES2[BYTE(X.s3, 2)];
	key.s2 ^= AES2[BYTE(X.s0, 2)];
	key.s3 ^= AES2[BYTE(X.s1, 2)];

	key.s0 ^= AES1[BYTE(X.s1, 1)];
	key.s1 ^= AES1[BYTE(X.s2, 1)];
	key.s2 ^= AES1[BYTE(X.s3, 1)];
	key.s3 ^= AES1[BYTE(X.s0, 1)];

	key.s0 ^= AES3[BYTE(X.s3, 3)];
	key.s1 ^= AES3[BYTE(X.s0, 3)];
	key.s2 ^= AES3[BYTE(X.s1, 3)];
	key.s3 ^= AES3[BYTE(X.s2, 3)];

	return key;
}

uint4 AES_Round2(const __local uint *AES0, const __local uint *AES1, const uint4 X, uint4 key)
{
	key.s0 ^= AES0[BYTE(X.s0, 0)];
	key.s1 ^= AES0[BYTE(X.s1, 0)];
	key.s2 ^= AES0[BYTE(X.s2, 0)];
	key.s3 ^= AES0[BYTE(X.s3, 0)];

	key.s0 ^= rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16u);
	key.s1 ^= rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16u);
	key.s2 ^= rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16u);
	key.s3 ^= rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16u);

	key.s0 ^= AES1[BYTE(X.s1, 1)];
	key.s1 ^= AES1[BYTE(X.s2, 1)];
	key.s2 ^= AES1[BYTE(X.s3, 1)];
	key.s3 ^= AES1[BYTE(X.s0, 1)];

	return key;
}

uint4 AES_Round2_bittube2(const __local uint *AES0, const __local uint *AES1, uint4 X, uint4 key)
{
	key.s0 ^= AES0[BYTE(X.s0, 0)] ^ rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16u) ^ AES1[BYTE(X.s1, 1)];
	X.s0 ^= key.s0;
	key.s1 ^= AES0[BYTE(X.s1, 0)] ^ rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16u) ^  AES1[BYTE(X.s2, 1)];
	X.s1 ^= key.s1;
	key.s2 ^= AES0[BYTE(X.s2, 0)] ^ rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16u) ^ AES1[BYTE(X.s3, 1)];
	X.s2 ^= key.s2;
	key.s3 ^= AES0[BYTE(X.s3, 0)] ^ rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16u) ^ AES1[BYTE(X.s0, 1)];

	return key;
}

#endif

)==="
